{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import re, sys, os, csv\n",
    "from many_stop_words import get_stop_words\n",
    "from nltk.corpus import stopwords\n",
    "from nltk.tokenize import word_tokenize\n",
    "from collections import Counter"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "#stop_words = list(get_stop_words('en'))         #About 900 stop words\n",
    "#nltk_words = list(stopwords.words('english'))   #About 150 stop words\n",
    "#stop_words.extend(nltk_words)\n",
    "\n",
    "def word_prob(word): return dictionary[word] / total\n",
    "def words(text): return re.findall('[a-z]+', text.lower())\n",
    "dictionary = Counter(words(open('dataset/wordlists/merged.txt').read()))\n",
    "max_word_length = max(map(len, dictionary))\n",
    "total = float(sum(dictionary.values()))\n",
    "\n",
    "def viterbi_segment(text):\n",
    "    probs, lasts = [1.0], [0]\n",
    "    for i in range(1, len(text) + 1):\n",
    "        prob_k, k = max((probs[j] * word_prob(text[j:i]), j)\n",
    "                        for j in range(max(0, i - max_word_length), i))\n",
    "        probs.append(prob_k)\n",
    "        lasts.append(k)\n",
    "    words = []\n",
    "    i = len(text)\n",
    "    while 0 < i:\n",
    "        words.append(text[lasts[i]:i])\n",
    "        i = lasts[i]\n",
    "    words.reverse()\n",
    "    return words, probs[-1]\n",
    "\n",
    "def fix_hashtag(text):\n",
    "    text = text.group().split(\":\")[0]\n",
    "    text = text[1:] # remove '#'\n",
    "    try:\n",
    "        test = int(text[0])\n",
    "        text = text[1:]\n",
    "    except:\n",
    "        pass\n",
    "    output = ' '.join(viterbi_segment(text)[0])\n",
    "    #print(output)\n",
    "    return output\n",
    "\n",
    "def clean_tweet( tweet):\n",
    "        tweet = tweet.lower()\n",
    "        tweet = re.sub(\"(#[A-Za-z0-9]+)\", fix_hashtag, tweet)\n",
    "        return ' '.join(re.sub(\"(@[A-Za-z0-9]+)|([^0-9A-Za-z \\t])|(\\w+:\\/\\/\\S+)\", \" \", tweet).split())\n",
    "\n",
    "def remove_stopwords(word_list):\n",
    "        filtered_tweet=\"\"\n",
    "        for word in word_list:\n",
    "            word = word.lower() \n",
    "            if word not in stopwords.words(\"english\"):\n",
    "                filtered_tweet=filtered_tweet + \" \" + word\n",
    "        \n",
    "        return filtered_tweet.lstrip()\n",
    "    \n",
    "def vectorise_label(label):\n",
    "    if label == \"empty\":return 1 # neutral\n",
    "    elif label == \"sadness\":return 2 # sad\n",
    "    elif label == \"enthusiasm\":return 3 # happy\n",
    "    elif label == \"neutral\":return 0 # neutral\n",
    "    elif label == \"worry\":return 4 # sad\n",
    "    elif label == \"surprise\":return 5 # happy\n",
    "    elif label == \"love\":return 6 # happy\n",
    "    elif label == \"fun\":return 7 # happy\n",
    "    elif label == \"hate\":return 8\n",
    "    elif label == \"happiness\":return 9 # happy\n",
    "    elif label == \"boredom\":return 10 # neutral\n",
    "    elif label == \"relief\":return 11 # happy\n",
    "    elif label == \"anger\":return 12"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Dataset shape: (40000, 4)\n",
      "empty : @tiffanylue i know  i was listenin to bad habit earlier and i started freakin at his part =[\n"
     ]
    }
   ],
   "source": [
    "data_train = pd.read_csv('dataset/data/text_emotion.csv', sep=',')\n",
    "print(\"Dataset shape:\",data_train.shape)\n",
    "print(data_train.sentiment[0],\":\",data_train.content[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Progress:  100 \n",
      "Complete!\n"
     ]
    }
   ],
   "source": [
    "dataWriter = csv.writer(open('data.csv', 'w'), delimiter=',',lineterminator=\"\\n\")\n",
    "\n",
    "total = 40000\n",
    "for i in range(40000):\n",
    "    #print(\"Progress: \",round(i/total*100,2),\"   \",end=\"\\r\")\n",
    "    tweet= clean_tweet(data_train.content[i])\n",
    "    #tweet = remove_stopwords(tweet.split())\n",
    "    dataWriter.writerow([tweet, str(vectorise_label(data_train.sentiment[i]))])\n",
    "    #sys.stdout.write(\"\\033[F\")\n",
    "    \n",
    "print(\"Progress: \",100,\"\\nComplete!\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "count = 0\n",
    "with open('data.csv') as csvfile:\n",
    "    readCSV = csv.reader(csvfile, delimiter=',')\n",
    "    for row in readCSV:\n",
    "        count+=1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "40000"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "count"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
